{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bcef2c1f-7202-44b0-9059-5c537600988d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'SXGzrN4dSXW1t0pkWXGfjg', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "import os\n",
    "\n",
    "# Connection settings come from the environment so no secrets are stored in the notebook.\n",
    "elastic_user = os.getenv('ES_USER')\n",
    "elastic_password = os.getenv('ES_PASSWORD')\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Fail early with a clear message; otherwise an unset variable would end up as the\n",
    "# literal text 'None' inside the connection URL.\n",
    "required_env = {\n",
    "    \"ES_USER\": elastic_user,\n",
    "    \"ES_PASSWORD\": elastic_password,\n",
    "    \"ES_ENDPOINT\": elastic_endpoint,\n",
    "}\n",
    "missing = [name for name, value in required_env.items() if not value]\n",
    "if missing:\n",
    "    raise RuntimeError(f\"Missing required environment variables: {', '.join(missing)}\")\n",
    "\n",
    "# Credentials go through basic_auth instead of being embedded in the URL: a password\n",
    "# containing characters such as '@', ':' or '/' would otherwise corrupt the URL, and\n",
    "# the `url` variable no longer carries the secret.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "es = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",  # CA certificate used to verify the cluster's TLS certificate\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Smoke test: prints the cluster name and version if the connection works.\n",
    "print(es.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4984711d-b58b-4834-ab38-9cfa71bf5a6d",
   "metadata": {},
   "source": [
    "# Create an ingest pipeline with a lowercase processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c417d523-5cfb-4555-aa22-1f2e4c4e3630",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Single processor: rewrite the `title` field in lowercase.\n",
    "lowercase_title_processor = {\"lowercase\": {\"field\": \"title\"}}\n",
    "\n",
    "# Register (or overwrite) the pipeline under the id `ingest-pipeline-lowercase`.\n",
    "es.ingest.put_pipeline(\n",
    "    id=\"ingest-pipeline-lowercase\",\n",
    "    description=\"Ingest pipeline to change title to lowercase\",\n",
    "    processors=[lowercase_title_processor],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "05933e16-47c8-449d-94ec-66d041d7e98b",
   "metadata": {},
   "source": [
    "# Create index - movies with mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4dfb7489-fa9e-4600-aadf-0cf0378454cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop any previous `movies` index first so the cell can be re-run;\n",
    "# ignore_unavailable=True keeps the call from failing when the index is absent.\n",
    "es.indices.delete(index=\"movies\", ignore_unavailable=True)\n",
    "\n",
    "# Index-level settings; `default_pipeline` names the lowercase pipeline so it is\n",
    "# applied to documents indexed without an explicit pipeline.\n",
    "movies_settings = {\n",
    "    \"index\": {\n",
    "        \"number_of_shards\": 1,\n",
    "        \"number_of_replicas\": 1,\n",
    "        \"default_pipeline\": \"ingest-pipeline-lowercase\",\n",
    "    }\n",
    "}\n",
    "\n",
    "# Only `plot` is mapped explicitly: full-text `text` plus a `keyword` sub-field.\n",
    "movies_mappings = {\n",
    "    \"properties\": {\n",
    "        \"plot\": {\n",
    "            \"type\": \"text\",\n",
    "            \"fields\": {\n",
    "                \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n",
    "            },\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "es.indices.create(\n",
    "    index=\"movies\",\n",
    "    settings=movies_settings,\n",
    "    mappings=movies_mappings,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c63447ac-f10a-46e8-afc0-4bfc08c3b166",
   "metadata": {},
   "source": [
    "# Insert Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bfa2ce18-b541-491e-8c6c-30b0c25b0523",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done indexing documents into `movies` index!\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from elasticsearch import helpers\n",
    "import time  # not needed by this cell; kept because later cells call time.sleep()\n",
    "\n",
    "# Load the movie records; the encoding is explicit so the result does not depend on\n",
    "# the platform's default text encoding.\n",
    "with open('movies.json', encoding='utf-8') as f:\n",
    "    data_json = json.load(f)\n",
    "\n",
    "# Prepare the documents to be indexed: one bulk action per record, all targeting `movies`.\n",
    "documents = [{\"_index\": \"movies\", \"_source\": doc} for doc in data_json]\n",
    "\n",
    "# Use helpers.bulk to index\n",
    "helpers.bulk(es, documents)\n",
    "\n",
    "# Refresh explicitly so the documents are searchable for the following cells;\n",
    "# this replaces a fixed time.sleep(5), which only guessed at the refresh timing.\n",
    "es.indices.refresh(index=\"movies\")\n",
    "\n",
    "print(\"Done indexing documents into `movies` index!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "511b660e-fb7f-4203-9f83-3e126ecbfbc2",
   "metadata": {},
   "source": [
    "# Create a new pipeline with ELSER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "51297aad-1943-400e-9aac-c7402972cebe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inference processor: run model `.elser_model_2` on `plot` and store the result\n",
    "# in `plot_embedding`.\n",
    "elser_plot_processor = {\n",
    "    \"inference\": {\n",
    "        \"model_id\": \".elser_model_2\",\n",
    "        \"input_output\": [\n",
    "            {\"input_field\": \"plot\", \"output_field\": \"plot_embedding\"}\n",
    "        ],\n",
    "    }\n",
    "}\n",
    "\n",
    "es.ingest.put_pipeline(\n",
    "    id=\"elser-ingest-pipeline\",\n",
    "    description=\"Ingest pipeline for ELSER\",\n",
    "    processors=[elser_plot_processor],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3835ab7-07cd-4502-861c-d84cb84c007d",
   "metadata": {},
   "source": [
    "# Create an index with mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "91d71fc7-e0f4-4343-8253-760061e8ae2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-movies'})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recreate `elser-movies` from scratch so the cell can be re-run;\n",
    "# ignore_unavailable=True keeps the delete from failing when the index is absent.\n",
    "es.indices.delete(index=\"elser-movies\", ignore_unavailable=True)\n",
    "\n",
    "elser_movies_properties = {\n",
    "    # Plot text, with a `keyword` sub-field.\n",
    "    \"plot\": {\n",
    "        \"type\": \"text\",\n",
    "        \"fields\": {\n",
    "            \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n",
    "        },\n",
    "    },\n",
    "    # Sparse vector field that holds the ELSER output for `plot`.\n",
    "    \"plot_embedding\": {\"type\": \"sparse_vector\"},\n",
    "}\n",
    "\n",
    "es.indices.create(\n",
    "    index=\"elser-movies\",\n",
    "    mappings={\"properties\": elser_movies_properties},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f460bf4a-5db1-4730-948e-3dad52cbef69",
   "metadata": {},
   "source": [
    "# Reindex with updated pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7e8ac75c-3db0-46f4-97eb-c4b1de82a443",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the documents from `movies` into `elser-movies`, sending each one through\n",
    "# `elser-ingest-pipeline` on the way in.\n",
    "# refresh=True asks Elasticsearch to refresh the destination once the reindex is done,\n",
    "# so the following search sees the documents; this replaces a fixed time.sleep(7)\n",
    "# and removes this cell's dependency on `time` being imported by an earlier cell.\n",
    "es.reindex(\n",
    "    source={\"index\": \"movies\"},\n",
    "    dest={\n",
    "        \"index\": \"elser-movies\",\n",
    "        \"pipeline\": \"elser-ingest-pipeline\",\n",
    "    },\n",
    "    refresh=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "773b5c16-44d9-446a-a3e0-7d82ce58ac7b",
   "metadata": {},
   "source": [
    "# Querying documents with ELSER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2335fc2e-7490-4045-bc33-28b2564cdf4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 6.4037457\n",
      "Title: Se7en\n",
      "Plot: Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his motives.\n",
      "\n",
      "Score: 3.6703415\n",
      "Title: The Departed\n",
      "Plot: An undercover cop and a mole in the police attempt to identify each other while infiltrating an Irish gang in South Boston.\n",
      "\n",
      "Score: 2.935915\n",
      "Title: The Usual Suspects\n",
      "Plot: A sole survivor tells of the twisty events leading up to a horrific gun battle on a boat, which began when five criminals met at a seemingly random police lineup.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query `plot_embedding` with a text_expansion query: the query text is expanded by\n",
    "# the same `.elser_model_2` model that produced the stored field. Top 3 hits only.\n",
    "response = es.search(\n",
    "    index='elser-movies',\n",
    "    size=3,\n",
    "    query={\n",
    "        \"text_expansion\": {\n",
    "            \"plot_embedding\": {\n",
    "                \"model_id\": \".elser_model_2\",\n",
    "                \"model_text\": \"investigation\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    ")\n",
    "\n",
    "# Print score, title and plot for each hit (the unused `doc_id` local was removed).\n",
    "for hit in response['hits']['hits']:\n",
    "    score = hit['_score']\n",
    "    title = hit['_source']['title']\n",
    "    plot = hit['_source']['plot']\n",
    "    print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9a6ba11-5e2a-4db2-83e6-165f57bb207d",
   "metadata": {},
   "source": [
    "# Create a new ingestion pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "86b8dd66-84fa-4556-ab38-ebcbd12f9bbb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inference processor for the upgrade demo: `.elser_model_2` reads `plot` and\n",
    "# writes its output to `plot_embedding`.\n",
    "upgrade_plot_processor = {\n",
    "    \"inference\": {\n",
    "        \"model_id\": \".elser_model_2\",\n",
    "        \"input_output\": [\n",
    "            {\"input_field\": \"plot\", \"output_field\": \"plot_embedding\"}\n",
    "        ],\n",
    "    }\n",
    "}\n",
    "\n",
    "es.ingest.put_pipeline(\n",
    "    id=\"elser-pipeline-upgrade-demo\",\n",
    "    description=\"Ingest pipeline for ELSER upgrade demo\",\n",
    "    processors=[upgrade_plot_processor],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f89edf4-9c8e-4a9a-9725-34bf070192a8",
   "metadata": {},
   "source": [
    "# Create a new index with mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ee3289fe-31b6-4b7a-beeb-ded43b9b91fb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-upgrade-index-demo'})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recreate the destination index for the upgrade demo so the cell can be re-run;\n",
    "# ignore_unavailable=True keeps the delete from failing when the index is absent.\n",
    "es.indices.delete(index=\"elser-upgrade-index-demo\", ignore_unavailable=True)\n",
    "\n",
    "upgrade_demo_properties = {\n",
    "    # Plot text, with a `keyword` sub-field.\n",
    "    \"plot\": {\n",
    "        \"type\": \"text\",\n",
    "        \"fields\": {\n",
    "            \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n",
    "        },\n",
    "    },\n",
    "    # Sparse vector field that holds the ELSER output for `plot`.\n",
    "    \"plot_embedding\": {\"type\": \"sparse_vector\"},\n",
    "}\n",
    "\n",
    "es.indices.create(\n",
    "    index=\"elser-upgrade-index-demo\",\n",
    "    mappings={\"properties\": upgrade_demo_properties},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "482f30e9-daaa-4edd-b9cf-cc89c6e3c9ef",
   "metadata": {},
   "source": [
    "# Use Reindex API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "619adf99-d783-4e2b-a02f-51eb459c5370",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reindex into the upgrade index through the new pipeline. The previously generated\n",
    "# tokens are excluded from the copied `_source` so the pipeline regenerates them.\n",
    "# refresh=True asks Elasticsearch to refresh the destination once the reindex is done,\n",
    "# replacing a fixed time.sleep(5) and the dependency on `time` from an earlier cell.\n",
    "es.reindex(\n",
    "    source={\n",
    "        \"index\": \"elser-movies\",  # replace with your index name\n",
    "        \"_source\": {\n",
    "            \"excludes\": [\"plot_embedding\"]  # replace with the field-name from your index, that has previously generated tokens\n",
    "        },\n",
    "    },\n",
    "    dest={\n",
    "        \"index\": \"elser-upgrade-index-demo\",\n",
    "        \"pipeline\": \"elser-pipeline-upgrade-demo\",\n",
    "    },\n",
    "    refresh=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "46f6d455-124f-490e-a141-226fd2f5e7fa",
   "metadata": {},
   "source": [
    "# Querying your data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b12911d5-71e5-42be-b43d-3da81074a5fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 3.3168304\n",
      "Title: Fight Club\n",
      "Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n",
      "\n",
      "Score: 1.5777304\n",
      "Title: The Godfather\n",
      "Plot: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.\n",
      "\n",
      "Score: 1.1162583\n",
      "Title: The Matrix\n",
      "Plot: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query `plot_embedding` in the upgraded index with a text_expansion query using\n",
    "# `.elser_model_2`. Top 3 hits only.\n",
    "response = es.search(\n",
    "    index='elser-upgrade-index-demo',\n",
    "    size=3,\n",
    "    query={\n",
    "        \"text_expansion\": {\n",
    "            \"plot_embedding\": {\n",
    "                \"model_id\": \".elser_model_2\",\n",
    "                \"model_text\": \"child toy\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    ")\n",
    "\n",
    "# Print score, title and plot for each hit (the unused `doc_id` local was removed).\n",
    "for hit in response['hits']['hits']:\n",
    "    score = hit['_score']\n",
    "    title = hit['_source']['title']\n",
    "    plot = hit['_source']['plot']\n",
    "    print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "51c783af-3b0b-4ee3-8888-6c606a71a441",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'blogs': {'aliases': {}, 'mappings': {'properties': {'genre': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'keyScene': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'plot': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'released': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'runtime': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'text_embedding': {'properties': {'is_truncated': {'type': 'boolean'}, 'model_id': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'predicted_value': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'l2_norm'}}}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'blogs', 'default_pipeline': 'vectorize_blogs', 'creation_date': '1703827804965', 'number_of_replicas': '1', 'uuid': 'MqMj3t-OTN-jCL3wj1rP5g', 'version': {'created': '8500003'}}}}})"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the definition of the `blogs` index as returned by the cluster.\n",
    "# NOTE(review): no cell in this notebook creates `blogs`; it is expected to already\n",
    "# exist in the cluster before this cell runs - confirm where it comes from.\n",
    "es.indices.get(index=\"blogs\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3533d4f3-b281-4193-a6ca-a069d387baeb",
   "metadata": {},
   "source": [
    "# Create ingestion pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "2f7d7605-ada7-40d9-9670-8cfdc6250905",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inference processor for blogs: `.elser_model_2` reads `title` and writes its\n",
    "# output to `title_embedding`.\n",
    "blogs_title_processor = {\n",
    "    \"inference\": {\n",
    "        \"model_id\": \".elser_model_2\",\n",
    "        \"input_output\": [\n",
    "            {\"input_field\": \"title\", \"output_field\": \"title_embedding\"}\n",
    "        ],\n",
    "    }\n",
    "}\n",
    "\n",
    "es.ingest.put_pipeline(\n",
    "    id=\"elser-pipeline-blogs\",\n",
    "    description=\"Ingest pipeline for ELSER upgrade\",\n",
    "    processors=[blogs_title_processor],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "945472e9-ad58-4fe9-b772-7ce973ff1b51",
   "metadata": {},
   "source": [
    "# Create index with mappings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9fa869b0-e259-4c47-96ac-eeec347308e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-blogs'})"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recreate `elser-blogs` from scratch so the cell can be re-run;\n",
    "# ignore_unavailable=True keeps the delete from failing when the index is absent.\n",
    "es.indices.delete(index=\"elser-blogs\", ignore_unavailable=True)\n",
    "\n",
    "elser_blogs_properties = {\n",
    "    # Title text, with a `keyword` sub-field.\n",
    "    \"title\": {\n",
    "        \"type\": \"text\",\n",
    "        \"fields\": {\n",
    "            \"keyword\": {\"type\": \"keyword\", \"ignore_above\": 256}\n",
    "        },\n",
    "    },\n",
    "    # Sparse vector field that holds the ELSER output for `title`.\n",
    "    \"title_embedding\": {\"type\": \"sparse_vector\"},\n",
    "}\n",
    "\n",
    "es.indices.create(\n",
    "    index=\"elser-blogs\",\n",
    "    mappings={\"properties\": elser_blogs_properties},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8142ddc-a877-421e-a4c0-32b3342e0803",
   "metadata": {},
   "source": [
    "# Reindex API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "7b07531c-136e-4c9f-b790-3a027916f4a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reindex `blogs` into `elser-blogs` through `elser-pipeline-blogs`, leaving the\n",
    "# existing `text_embedding` field out of the copied `_source`.\n",
    "# refresh=True asks Elasticsearch to refresh the destination once the reindex is done,\n",
    "# replacing a fixed time.sleep(5) and the dependency on `time` from an earlier cell.\n",
    "es.reindex(\n",
    "    source={\n",
    "        \"index\": \"blogs\",\n",
    "        \"_source\": {\n",
    "            \"excludes\": [\"text_embedding\"]\n",
    "        },\n",
    "    },\n",
    "    dest={\n",
    "        \"index\": \"elser-blogs\",\n",
    "        \"pipeline\": \"elser-pipeline-blogs\",\n",
    "    },\n",
    "    refresh=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc6d46af-b1e7-4af4-abc7-59fdffcfc8ec",
   "metadata": {},
   "source": [
    "# Querying your data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "de2a8809-9daf-4118-91d2-0dde0a3553cc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 0.15847498\n",
      "Title: Se7en\n",
      "Score: 0.048573207\n",
      "Title: Inception\n",
      "Score: 0.0070162327\n",
      "Title: The Departed\n"
     ]
    }
   ],
   "source": [
    "# Query `title_embedding` with a text_expansion query using `.elser_model_2`.\n",
    "# Top 3 hits only.\n",
    "response = es.search(\n",
    "    index='elser-blogs',\n",
    "    size=3,\n",
    "    query={\n",
    "        \"text_expansion\": {\n",
    "            \"title_embedding\": {\n",
    "                \"model_id\": \".elser_model_2\",\n",
    "                \"model_text\": \"Track network connections\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    ")\n",
    "\n",
    "# Print score and title for each hit (the unused `doc_id` local was removed).\n",
    "for hit in response['hits']['hits']:\n",
    "    score = hit['_score']\n",
    "    title = hit['_source']['title']\n",
    "    print(f\"Score: {score}\\nTitle: {title}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c614c69-478e-4c07-89f1-7a08aa2c4601",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
